/* File: build_matched_regression_datasets.do
 * Author: Luca Maini
 * Purpose: creates datasets for matched-cohort analysis
 * Output: matched_cohorts_sales.dta and matched_cohorts_coverage.dta, saved in ${maindir}
 *
 * Date Created: 01/23/2023
 *
 */


/*	This file merges the matched cohorts to outcome data. The script also
	drops matched cohorts that do not satisfy the requirements for inclusion. The 
	criteria for inclusion are:
	
		- 	At least 5 matched controls in the cohort
		- 	Treated unit AND controls all have data for the relevant outcomes in
			the year before and after the acquisition (missing data in the year 
			of acquisition is okay, though in practice should never happen)
*/

////////////////////////////////////////////////////////
////												////
////	PART 0. Define program to build the data	////
////												////
////////////////////////////////////////////////////////

capture program drop matchCohortData	// lets the do-file be re-run in the same Stata session
program define matchCohortData
	
	args vars threshold name
	
	/* Purpose: merge the matched cohorts with outcome data, drop cohorts and 
	   controls that fail the inclusion criteria, and save the result.
	
	   Arguments:
		
		vars 		--> outcome variables that must be observed around the 
						acquisition (usually split sales/coverage); wildcards 
						are allowed
		threshold 	--> minimum number of matched controls a cohort must retain 
						after controls with missing data are removed; if empty, 
						no minimum is enforced
		name 		--> suffix attached to the saved dataset name
		
	   Reads:  ${maindir}/combined_regression_dataset_with_valeant_augmented.dta
	           ${outdir}/matched_cohorts_all.dta
	   Writes: ${maindir}/matched_cohorts_<name>.dta
	   
	   Forward slashes are used in paths because Stata accepts them on every 
	   platform, whereas backslashes only work on Windows.
	*/
	
	*** Step 1: load the outcome data and keep only the requested outcomes
	use "${maindir}/combined_regression_dataset_with_valeant_augmented.dta", clear
	
	* Expand wildcards here, against the outcome data only, so the checks below
	* run on exactly these variables and never on a look-alike variable that
	* happens to exist in the cohort dataset
	unab outcomeVars : `vars'
	keep Product year `outcomeVars'
	 
	tempfile outcomeData
	qui save `outcomeData', replace

	*** Step 2: open dataset of treated and matched controls
	qui use "${outdir}/matched_cohorts_all.dta", clear

	* In the cohort data `year' is the acquisition year; rename it so it does not
	* clash with the calendar year coming from the outcome data
	rename year yearAcq
	
	* Merge with outcome data: every cohort row is paired with all outcome years
	* of the same Product
	qui joinby Product using `outcomeData'	// , unmatched(master) // there should be no unmatched

	*** Step 3: determine which products have enough data to be included
	* (each outcome must be observed in the year before and the year after the 
	* acquisition year)
	foreach var of local outcomeVars {
		qui bysort Treated_Product yearAcq cem_varlist Product (year) : ///
			egen has_event_minus1 = max(year == yearAcq - 1 & !missing(`var'))
		qui bysort Treated_Product yearAcq cem_varlist Product (year) : ///
			egen has_event_plus1 = max(year == yearAcq + 1 & !missing(`var'))
		
		qui gen in_sample_`var' = has_event_minus1 * has_event_plus1
		
		drop has_event_minus1 has_event_plus1
		}

	*** Step 4: drop whole cohorts whose treated product is out of sample for at
	* least one of the outcome variables
	qui gen all_in_sample_treated = treated
	foreach var of local outcomeVars {
		qui replace all_in_sample_treated = 0 if in_sample_`var' == 0
		}

	qui bysort Treated_Product yearAcq cem_varlist : egen keepAll = max(all_in_sample_treated)
	qui drop if keepAll == 0

	*** Step 5: drop individual matched controls that are out of sample for at
	* least one of the outcome variables
	qui gen all_in_sample = 1
	foreach var of local outcomeVars {
		qui replace all_in_sample = 0 if in_sample_`var' == 0
		}

	qui keep if all_in_sample == 1
	drop all_in_sample_treated keepAll all_in_sample in_sample_*

	*** Step 6: enforce the minimum number of matched controls per cohort
	* Each product appears once per outcome year, so tag a single row per 
	* control product and sum the tags within the cohort to count controls
	if "`threshold'" != "" {
		tempvar isFirstControlRow nControls
		qui bysort Treated_Product yearAcq cem_varlist Product (year) : ///
			gen byte `isFirstControlRow' = (_n == 1) & (treated != 1)
		qui bysort Treated_Product yearAcq cem_varlist : ///
			egen `nControls' = total(`isFirstControlRow')
		qui drop if `nControls' < `threshold'
		
		* drop explicitly so the helper variables cannot end up in the saved file
		drop `isFirstControlRow' `nControls'
		}

	* Save dataset before running regressions
	qui save "${maindir}/matched_cohorts_`name'.dta", replace
end


////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////
////															////
////	 PART 1. Create matched datasets for sales data			////
////															////
////////////////////////////////////////////////////////////////////

* Acquired drugs: build the matched-cohort dataset for the sales outcomes,
* passing 5 as the threshold argument; saved with the suffix "sales"
local salesOutcomes log_WAC log_net w_log_net log_units log_sales
matchCohortData `"`salesOutcomes'"' 5 "sales"

////////////////////////////////////////////////////////////////////////
////																////
////	 PART 2. Create matched datasets for coverage data			////
////																////
////////////////////////////////////////////////////////////////////////

* Acquired drugs: build the matched-cohort dataset for the coverage outcomes
* (variable-name patterns are expanded inside the program), passing 5 as the
* threshold argument; saved with the suffix "coverage"
local coverageOutcomes frcov* frunr* frpre* glp*
matchCohortData `"`coverageOutcomes'"' 5 "coverage"
